In [ ]:
import pandas as pd
import sqlalchemy as sa
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from plotly.subplots import make_subplots
In [ ]:
import plotly.io as pio
pio.templates.default = "seaborn"
pio.renderers.default = "vscode+notebook"
pd.options.display.max_columns = None
# print current renderer

0. Save data to sql database¶

0.1 Create a connection to the postgres server¶

In [ ]:
# create connection to database
engine = sa.create_engine(
    # postgres connection url
    "postgresql://postgres:pass@localhost/postgres",
    # select the schema
    connect_args={"options": "-c search_path=hr"},
)
conn = engine.connect()

0.2 Save data to sql tables¶

Don't need to do this if data was already imported some other way.

In [ ]:
# employee_df = pd.read_csv("Datasets/employee_survey_data.csv", index_col=0)
# employee_df.to_sql("employee_survey_data", conn, if_exists="replace")
# general_df = pd.read_csv("Datasets/general_data.csv", index_col=8)
# general_df.to_sql("general_data", conn, if_exists="replace")
# manager_df = pd.read_csv("Datasets/manager_survey_data.csv", index_col=0)
# manager_df.to_sql("manager_survey_data", conn, if_exists="replace")

1. Importing the data¶

In [ ]:
employee_df = pd.read_sql_table("employee_survey_data", conn)
general_df = pd.read_sql_table("general_data", conn)
manager_df = pd.read_sql_table("manager_survey_data", conn)

#######################################
# Alternatively import from csv files #
#######################################
# employee_df = pd.read_csv("Datasets/employee_survey_data.csv", index_col=0)
# general_df = pd.read_csv("Datasets/general_data.csv", index_col=8)
# manager_df = pd.read_csv("Datasets/manager_survey_data.csv", index_col=0)
In [ ]:
display(employee_df.head())
display(general_df.head())
display(manager_df.head())
EmployeeID EnvironmentSatisfaction JobSatisfaction WorkLifeBalance
0 1 3.0 4.0 2.0
1 2 3.0 2.0 4.0
2 3 2.0 2.0 1.0
3 4 4.0 4.0 3.0
4 5 4.0 1.0 3.0
EmployeeID Age Attrition BusinessTravel Department DistanceFromHome Education EducationField EmployeeCount Gender JobLevel JobRole MaritalStatus MonthlyIncome NumCompaniesWorked Over18 PercentSalaryHike StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear YearsAtCompany YearsSinceLastPromotion YearsWithCurrManager
0 1 51 No Travel_Rarely Sales 6 2 Life Sciences 1 Female 1 Healthcare Representative Married 131160 1.0 Y 11 8 0 1.0 6 1 0 0
1 2 31 Yes Travel_Frequently Research & Development 10 1 Life Sciences 1 Female 1 Research Scientist Single 41890 0.0 Y 23 8 1 6.0 3 5 1 4
2 3 32 No Travel_Frequently Research & Development 17 4 Other 1 Male 4 Sales Executive Married 193280 1.0 Y 15 8 3 5.0 2 5 0 3
3 4 38 No Non-Travel Research & Development 2 5 Life Sciences 1 Male 3 Human Resources Married 83210 3.0 Y 11 8 3 13.0 5 8 7 5
4 5 32 No Travel_Rarely Research & Development 10 1 Medical 1 Male 1 Sales Executive Single 23420 4.0 Y 12 8 2 9.0 2 6 0 4
EmployeeID JobInvolvement PerformanceRating
0 1 3 3
1 2 2 4
2 3 3 3
3 4 2 3
4 5 3 3

2. Data Preprocessing¶

2.1 Joining the tables¶

In [ ]:
data_df = general_df.merge(employee_df, on="EmployeeID", how="left")
data_df = data_df.merge(manager_df, on="EmployeeID", how="left")

2.2 Checking for null values¶

We will check for any null values, and decide if we want to drop them or impute them.

In [ ]:
num_null_rows = np.count_nonzero(data_df.isna())
print(f"Number of rows with null values: {num_null_rows}")
print(f"Percent of rows with null values: {num_null_rows/data_df.shape[0]*100:.3f}%")
Number of rows with null values: 111
Percent of rows with null values: 2.517%

There are not very many null values so we can safely drop them without affecting the final analysis.

In [ ]:
# drop rows with missing values
data_df = data_df.dropna()

2.3 Checking for duplicate values¶

In [ ]:
print(f"Duplicate Rows: {data_df.duplicated().sum()}")
Duplicate Rows: 0

There are no duplicate rows in the dataset.

3. Exploratory Data Analysis¶

The variables in the employee and manager survey are pretty self explanatory.


For now lets focus on the general data.
We will divide the variables to numerical and categorical.
In [ ]:
ignored_columns = [
    "EmployeeID",
]

numerical_columns = [
    # "EmployeeID",
    "Age",
    # "Attrition",
    # "BusinessTravel",
    # "Department",
    "DistanceFromHome",
    # "Education",
    # "EducationField",
    "EmployeeCount",
    # "Gender",
    # "JobLevel",
    # "JobRole",
    # "MaritalStatus",
    "MonthlyIncome",
    "NumCompaniesWorked",
    # "Over18",
    "PercentSalaryHike",
    "StandardHours",
    "StockOptionLevel",
    "TotalWorkingYears",
    "TrainingTimesLastYear",
    "YearsAtCompany",
    "YearsSinceLastPromotion",
    "YearsWithCurrManager",
    # 'EnvironmentSatisfaction',
    # 'JobSatisfaction',
    # 'WorkLifeBalance',
    # 'JobInvolvement',
    # 'PerformanceRating'
]

categorical_columns = [
    var
    for var in general_df.columns
    if var not in numerical_columns and var not in ignored_columns
]

# # make categorical columns into category type
# for col in categorical_columns:
#     data_df[col] = data_df[col].astype("category")

Create functions to visualise numerical data and categorical data

In [ ]:
def numerical_vis(data, variable):
    from plotly.subplots import make_subplots
    hist_fig = px.histogram(data, x=variable, nbins=20)

    box_fig = px.box(data, y=variable)

    final_fig = make_subplots(rows=1, cols=2)
    final_fig.add_trace(hist_fig.data[0], row=1, col=1)
    final_fig.add_trace(box_fig.data[0], row=1, col=2)

    # title
    final_fig.update_layout(title_text=f"Histogram and Boxplot of {variable}")
    
    final_fig.show()
    # fig, ax = plt.subplots(1, 2, figsize=(12, 6))

    # # hist plot
    # sns.histplot(x=data[variable], ax=ax[0], kde=True)
    # # box plot
    # sns.boxplot(x=data[variable], ax=ax[1])
    # # titles
    # ax[0].set_title(f"Histogram of {variable}")
    # ax[1].set_title(f"Boxplot of {variable}")
    # plt.show()


def categorical_vis(data, variable):
    # fig, ax = plt.subplots(1, 2, figsize=(12, 6))

    # # pie plot
    # data[variable].value_counts().plot(kind="pie", ax=ax[0], autopct="%1.1f%%")

    # # bar chart
    # sns.countplot(x=data[variable], ax=ax[1])
    # # titles
    # ax[0].set_title(f"Pie chart of {variable}")
    # ax[1].set_title(f"Bar chart of {variable}")
    pie_fig = px.pie(data, names=variable)
    
    bar_fig = px.histogram(data, x=variable, color=variable)

    final_fig = make_subplots(rows=1, cols=2, specs=[[{'type':'pie'}, {'type':'xy'}]])
    final_fig.add_trace(pie_fig.data[0], row=1, col=1)

    for index in range(len(bar_fig.data)):
        final_fig.add_trace(bar_fig.data[index], row=1, col=2)
    # barmode
    final_fig.update_layout(barmode='stack')
    # bargap
    final_fig.update_layout(bargap=0.1)
    # title
    final_fig.update_layout(title_text=f"Distribution of {variable}")
    # center title
    final_fig.update_layout(title_x=0.5)

    # hide legend
    final_fig.update_layout(showlegend=False)
    # title font size
    final_fig.update_layout(title_font_size=20)


    final_fig.show()


categorical_vis(data_df, "Attrition")

3.1 Overview of Numerical variables¶

In [ ]:
for var in numerical_columns:
    numerical_vis(general_df, var)

From the plots we can see a quick overview of the distribution of the variables.


The main thing we notice here is that StandardHours and EmployeeCount only have 1 value. As such they likely have no effect on Attrition, and we can safely drop these columns.
In [ ]:
data_df = data_df.drop(columns=["EmployeeCount", "StandardHours"])
# remove columns from numerical columns
numerical_columns.remove("EmployeeCount")
numerical_columns.remove("StandardHours")

data_df.head()
Out[ ]:
EmployeeID Age Attrition BusinessTravel Department DistanceFromHome Education EducationField Gender JobLevel JobRole MaritalStatus MonthlyIncome NumCompaniesWorked Over18 PercentSalaryHike StockOptionLevel TotalWorkingYears TrainingTimesLastYear YearsAtCompany YearsSinceLastPromotion YearsWithCurrManager EnvironmentSatisfaction JobSatisfaction WorkLifeBalance JobInvolvement PerformanceRating
0 1 51 No Travel_Rarely Sales 6 2 Life Sciences Female 1 Healthcare Representative Married 131160 1.0 Y 11 0 1.0 6 1 0 0 3.0 4.0 2.0 3 3
1 2 31 Yes Travel_Frequently Research & Development 10 1 Life Sciences Female 1 Research Scientist Single 41890 0.0 Y 23 1 6.0 3 5 1 4 3.0 2.0 4.0 2 4
2 3 32 No Travel_Frequently Research & Development 17 4 Other Male 4 Sales Executive Married 193280 1.0 Y 15 3 5.0 2 5 0 3 2.0 2.0 1.0 3 3
3 4 38 No Non-Travel Research & Development 2 5 Life Sciences Male 3 Human Resources Married 83210 3.0 Y 11 3 13.0 5 8 7 5 4.0 4.0 3.0 2 3
4 5 32 No Travel_Rarely Research & Development 10 1 Medical Male 1 Sales Executive Single 23420 4.0 Y 12 2 9.0 2 6 0 4 4.0 1.0 3.0 3 3

3.2 Overview of categorical variables¶

In [ ]:
for var in categorical_columns:
    categorical_vis(data_df, var)

The distribution of the categorical variables can be seen above.


All of the workers seem to be over 18, so this metric will not have an affect on Attrition. We can safely drop this column too.
In [ ]:
data_df = data_df.drop(columns=["Over18"])
# remove columns from categorical columns
categorical_columns.remove("Over18")

3.1 Data Correlation¶

We will change binary categorical variables to 1, 0 and ordered categorical data to numbers

In [ ]:
# ordered categorical data to numerical
data_df["Attrition"] = data_df["Attrition"].map({"Yes": 1, "No": 0})
# data_df["Over18"] = data_df["Over18"].map({"Y": 1, "N": 0})
data_df["BusinessTravel"] = data_df["BusinessTravel"].map(
    {"Non-Travel": 0, "Travel_Rarely": 1, "Travel_Frequently": 2}
)
data_df["BusinessTravel"].value_counts()
data_df.head()
Out[ ]:
EmployeeID Age Attrition BusinessTravel Department DistanceFromHome Education EducationField Gender JobLevel JobRole MaritalStatus MonthlyIncome NumCompaniesWorked PercentSalaryHike StockOptionLevel TotalWorkingYears TrainingTimesLastYear YearsAtCompany YearsSinceLastPromotion YearsWithCurrManager EnvironmentSatisfaction JobSatisfaction WorkLifeBalance JobInvolvement PerformanceRating
0 1 51 0 1 Sales 6 2 Life Sciences Female 1 Healthcare Representative Married 131160 1.0 11 0 1.0 6 1 0 0 3.0 4.0 2.0 3 3
1 2 31 1 2 Research & Development 10 1 Life Sciences Female 1 Research Scientist Single 41890 0.0 23 1 6.0 3 5 1 4 3.0 2.0 4.0 2 4
2 3 32 0 2 Research & Development 17 4 Other Male 4 Sales Executive Married 193280 1.0 15 3 5.0 2 5 0 3 2.0 2.0 1.0 3 3
3 4 38 0 0 Research & Development 2 5 Life Sciences Male 3 Human Resources Married 83210 3.0 11 3 13.0 5 8 7 5 4.0 4.0 3.0 2 3
4 5 32 0 1 Research & Development 10 1 Medical Male 1 Sales Executive Single 23420 4.0 12 2 9.0 2 6 0 4 4.0 1.0 3.0 3 3
In [ ]:
# check correlation
corr = data_df.corr()
px.imshow(
    corr,
    height=1200,
    width=1200,
    # min value of the color scale -1
    zmin=-1,
    # max value of the color scale 1
    zmax=1,
    # title
    title="Correlation Matrix",
    # color scale
    color_continuous_scale=px.colors.diverging.RdYlGn,
)

3.4 Visualise relationship of each variable to attrition¶

In [ ]:
def compare_categorical_vis(data, variable):
    fig = px.histogram(
        data,
        x=variable,
        color="Attrition",
        barmode="overlay",
        title=f"Histogram of {variable} by Attrition",
    )
    # bargap
    fig.update_layout(bargap=0.1)
    fig.show()


def compare_numerical_vis(data, variable):
    hist_fig = px.histogram(data, x=variable, color="Attrition")

    
    box_fig = px.box(data, x="Attrition", y=variable, color="Attrition")

    final_fig = make_subplots(rows=1, cols=2)
    final_fig.add_trace(hist_fig.data[0], row=1, col=1)
    final_fig.add_trace(hist_fig.data[1], row=1, col=1)

    final_fig.add_trace(box_fig.data[0], row=1, col=2)
    final_fig.add_trace(box_fig.data[1], row=1, col=2)
    # hide legend
    # final_fig.update_layout(showlegend=False)
    # title
    final_fig.update_layout(title_text=f"Attrition by {variable}")
    # title center
    final_fig.update_layout(title_x=0.5)
    # barmode
    final_fig.update_layout(barmode="overlay")
    final_fig.show()

Visualuse numerical variables¶

In [ ]:
for var in numerical_columns:
    compare_numerical_vis(data_df, var)

Visualuse categorical variables¶

In [ ]:
for var in categorical_columns:
    compare_categorical_vis(data_df, var)